@realtimex/realtimex-alchemy 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/bin/realtimex-alchemy.js +55 -0
- package/dist/api/config/index.js +33 -0
- package/dist/api/index.js +237 -0
- package/dist/api/lib/ContentCleaner.js +114 -0
- package/dist/api/lib/types.js +1 -0
- package/dist/api/services/AlchemistService.js +241 -0
- package/dist/api/services/EventService.js +53 -0
- package/dist/api/services/LibrarianService.js +72 -0
- package/dist/api/services/MinerService.js +314 -0
- package/dist/api/services/ProcessingEventService.js +75 -0
- package/dist/api/services/RouterService.js +40 -0
- package/dist/api/services/SupabaseService.js +49 -0
- package/dist/api/utils/BrowserPathDetector.js +206 -0
- package/dist/api/utils/UrlNormalizer.js +176 -0
- package/dist/api/utils/contentCleaner.js +114 -0
- package/dist/api/utils/contentCleaner.test.js +96 -0
- package/dist/assets/index-7Lemtnxa.css +1 -0
- package/dist/assets/index-CRgCScOz.js +101 -0
- package/dist/email-automator-logo.svg +51 -0
- package/dist/favicon.svg +45 -0
- package/dist/index.html +18 -0
- package/package.json +80 -0
package/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 RealTimeX
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { spawn } from 'child_process';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import { dirname, join } from 'path';
|
|
6
|
+
|
|
7
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// CLI arguments after `node <script>`; forwarded verbatim to the server process.
const args = process.argv.slice(2);

// Default port, overridable with `--port <number>`.
let port = '3012';
const portIndex = args.indexOf('--port');
if (portIndex !== -1 && args[portIndex + 1]) {
    const requestedPort = args[portIndex + 1];
    // The value is exported to the server as PORT. Node's `listen()` treats a
    // non-numeric string as an IPC path rather than a TCP port, so reject
    // anything that is not a plain port number instead of forwarding it.
    if (!/^\d+$/.test(requestedPort) || Number(requestedPort) > 65535) {
        console.error(`❌ Invalid --port value "${requestedPort}". Expected an integer between 0 and 65535.`);
        process.exit(1);
    }
    port = requestedPort;
}

console.log('🚀 RealTimeX Alchemy starting...');
console.log(`📡 Port: ${port}`);
console.log('');

// Path to compiled server
const serverPath = join(__dirname, '..', 'dist', 'api', 'index.js');
const distPath = join(__dirname, '..', 'dist');

// Run the server with the same Node binary, sharing this process's stdio.
const server = spawn(process.execPath, [serverPath, ...args], {
    stdio: 'inherit',
    env: {
        ...process.env,
        PORT: port,
        ELECTRON_STATIC_PATH: distPath
    },
});

// Set once this launcher has been asked to stop, so that a signal-terminated
// child can be told apart from a child that crashed on its own.
let shuttingDown = false;

server.on('error', (error) => {
    console.error('❌ Failed to start RealTimeX Alchemy:', error.message);
    process.exit(1);
});

server.on('close', (code, signal) => {
    if (code === null) {
        // `code` is null when the child was terminated by a signal.
        if (!shuttingDown) {
            console.log(`\n⚠️ RealTimeX Alchemy terminated by signal ${signal}`);
        }
        // A requested shutdown is a clean exit; an unexpected signal is a failure.
        process.exit(shuttingDown ? 0 : 1);
    }
    if (code !== 0) {
        console.log(`\n⚠️ RealTimeX Alchemy stopped with code ${code}`);
    }
    process.exit(code);
});

process.on('SIGINT', () => {
    shuttingDown = true;
    console.log('\n\n⏹️ Shutting down RealTimeX Alchemy...');
    server.kill('SIGINT');
});

process.on('SIGTERM', () => {
    shuttingDown = true;
    server.kill('SIGTERM');
});
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
import dotenv from 'dotenv';
|
|
2
|
+
import path from 'path';
|
|
3
|
+
import { fileURLToPath } from 'url';
|
|
4
|
+
dotenv.config();
|
|
5
|
+
const __filename = fileURLToPath(import.meta.url);
|
|
6
|
+
const __dirname = path.dirname(__filename);
|
|
7
|
+
/**
 * Runtime configuration for the API process.
 *
 * Environment-driven values are read once at module load; PORT and
 * OLLAMA_HOST fall back to defaults when the variable is unset or empty.
 * DATA_DIR is resolved against the current working directory.
 */
export const CONFIG = (() => {
    const {
        PORT,
        OLLAMA_HOST,
        OPENAI_API_KEY,
        ANTHROPIC_API_KEY,
        AGENT_BROWSER_API_KEY,
    } = process.env;
    return {
        PORT: PORT || 3000,
        OLLAMA_HOST: OLLAMA_HOST || 'http://localhost:11434',
        OPENAI_API_KEY,
        ANTHROPIC_API_KEY,
        AGENT_BROWSER_API_KEY,
        DATA_DIR: path.join(process.cwd(), 'data'),
        MAX_HISTORY_ITEMS: 50,
        RETENTION_DAYS: 30,
    };
})();
|
|
17
|
+
/**
 * Default browser history database locations, keyed by `process.platform`
 * value and then by browser name.
 *
 * macOS and Linux paths are rooted at $HOME, Windows paths at %LOCALAPPDATA%.
 * When the variable is unset the path is built from an empty base.
 */
export const BROWSER_PATHS = (() => {
    const homeDir = process.env.HOME || '';
    const localAppData = process.env.LOCALAPPDATA || '';
    const underHome = (relative) => path.join(homeDir, relative);
    const underLocalAppData = (relative) => path.join(localAppData, relative);
    return {
        darwin: {
            chrome: underHome('Library/Application Support/Google/Chrome/Default/History'),
            edge: underHome('Library/Application Support/Microsoft Edge/Default/History'),
            brave: underHome('Library/Application Support/BraveSoftware/Brave-Browser/Default/History'),
            safari: underHome('Library/Safari/History.db'),
        },
        win32: {
            chrome: underLocalAppData('Google/Chrome/User Data/Default/History'),
            edge: underLocalAppData('Microsoft/Edge/User Data/Default/History'),
            brave: underLocalAppData('BraveSoftware/Brave-Browser/User Data/Default/History'),
        },
        linux: {
            chrome: underHome('.config/google-chrome/Default/History'),
            brave: underHome('.config/BraveSoftware/Brave-Browser/Default/History'),
        },
    };
})();
|
|
@@ -0,0 +1,237 @@
|
|
|
1
|
+
import express from 'express';
|
|
2
|
+
import cors from 'cors';
|
|
3
|
+
import path from 'path';
|
|
4
|
+
import { fileURLToPath } from 'url';
|
|
5
|
+
import { MinerService } from './services/MinerService.js';
|
|
6
|
+
import { AlchemistService } from './services/AlchemistService.js';
|
|
7
|
+
import { LibrarianService } from './services/LibrarianService.js';
|
|
8
|
+
import { CONFIG } from './config/index.js';
|
|
9
|
+
import { EventService } from './services/EventService.js';
|
|
10
|
+
import { SupabaseService } from './services/SupabaseService.js';
|
|
11
|
+
import { BrowserPathDetector } from './utils/BrowserPathDetector.js';
|
|
12
|
+
import { ProcessingEventService } from './services/ProcessingEventService.js';
|
|
13
|
+
import fs from 'fs';
|
|
14
|
+
// ES modules have no built-in __dirname; derive it from this module's URL.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);

// HTTP application with JSON body parsing.
// NOTE(review): CORS is open to every origin; confirm that is intended for
// an API that can read local files on the host.
const app = express();
app.use(cors({ origin: '*' }));
app.use(express.json());

// Long-lived service instances shared by all route handlers.
const miner = new MinerService();
const alchemist = new AlchemistService();
const librarian = new LibrarianService();
const events = EventService.getInstance();
|
|
23
|
+
// Health check: reports that the API is up and which platform it runs on.
app.get('/health', (_req, res) => {
    const payload = { status: 'active', platform: process.platform };
    res.json(payload);
});
|
|
27
|
+
// SSE Events: registers the response as an event-stream client and
// unregisters it when the request closes.
app.get('/events', (req, res) => {
    const sseHeaders = [
        ['Content-Type', 'text/event-stream'],
        ['Cache-Control', 'no-cache'],
        ['Connection', 'keep-alive'],
    ];
    for (const [headerName, headerValue] of sseHeaders) {
        res.setHeader(headerName, headerValue);
    }
    events.addClient(res);
    req.on('close', () => events.removeClient(res));
});
|
|
35
|
+
/**
 * Helper: build a Supabase client for the incoming request.
 *
 * Connection details come from the `x-supabase-url` / `x-supabase-key`
 * request headers when both are present; otherwise from the SUPABASE_URL /
 * SUPABASE_ANON_KEY environment variables when SupabaseService reports it is
 * configured. A bearer token from the Authorization header, if any, is
 * passed along as the access token.
 *
 * @param {object} req - Request object exposing lower-cased `headers`.
 * @returns {*} Whatever `SupabaseService.createClient` returns.
 * @throws {Error} When neither headers nor environment supply a configuration.
 */
function getAuthenticatedSupabase(req) {
    const supabaseUrl = req.headers['x-supabase-url'];
    const supabaseKey = req.headers['x-supabase-key'];
    const authHeader = req.headers['authorization'];
    // HTTP auth scheme names are case-insensitive, so accept "bearer" and
    // "BEARER" as well as "Bearer", and tolerate extra surrounding whitespace.
    const bearerMatch = typeof authHeader === 'string'
        ? /^Bearer\s+(.+)$/i.exec(authHeader.trim())
        : null;
    const accessToken = bearerMatch ? bearerMatch[1] : undefined;
    if (supabaseUrl && supabaseKey) {
        return SupabaseService.createClient(supabaseUrl, supabaseKey, accessToken);
    }
    if (SupabaseService.isConfigured() && process.env.SUPABASE_URL && process.env.SUPABASE_ANON_KEY) {
        return SupabaseService.createClient(process.env.SUPABASE_URL, process.env.SUPABASE_ANON_KEY, accessToken);
    }
    throw new Error('Supabase Configuration Missing. Please configure in Settings or add .env file.');
}
|
|
49
|
+
// Get signals: returns the librarian's signals for the request-scoped
// Supabase client, or a 500 with the error message on failure.
app.get('/api/signals', async (req, res) => {
    try {
        const client = getAuthenticatedSupabase(req);
        res.json(await librarian.getSignals(client));
    }
    catch (err) {
        res.status(500).json({ error: err.message });
    }
});
|
|
60
|
+
// Test/Debug endpoints
|
|
61
|
+
// Trigger Mining (Multi-source)
//
// Loads the caller's settings, mines browser history, then hands the history
// to the alchemist in the background. The HTTP response is sent as soon as
// mining finishes; analysis progress is reported through processing events.
app.post('/api/mine', async (req, res) => {
    const processingEvents = ProcessingEventService.getInstance();
    const syncStartTime = Date.now();
    try {
        const supabase = getAuthenticatedSupabase(req);
        // Get settings for the active user using the TOKEN-SCOPED client.
        // No explicit user_id filter is applied: row-level security is
        // expected to scope rows to the token's user (TODO confirm), and the
        // first row found is used.
        const { data: settings, error: settingsError } = await supabase
            .from('alchemy_settings')
            .select('*')
            .limit(1)
            .single();
        if (!settings) {
            // Keep the user-facing message, but do not lose the underlying cause.
            if (settingsError) {
                console.error('[API] Failed to load settings:', settingsError.message);
            }
            throw new Error('Alchemy Engine settings not found. Please save settings in the UI first.');
        }
        // Guard against a non-array value so `.filter` cannot throw.
        const configuredSources = Array.isArray(settings.custom_browser_paths)
            ? settings.custom_browser_paths
            : [];
        const enabledSources = configuredSources.filter((s) => s?.enabled);
        // Emit: Sync Starting
        await processingEvents.log({
            eventType: 'info',
            agentState: 'Starting',
            message: 'Sync starting...',
            level: 'info',
            metadata: {
                is_start: true,
                sync_mode: settings.sync_mode || 'incremental',
                max_urls: settings.max_urls_per_sync || 50,
                browser_sources: enabledSources.length,
                browsers: enabledSources.map((s) => s.label).join(', ')
            },
            userId: settings.user_id
        }, supabase);
        console.log('[API] Settings loaded:', {
            id: settings.id,
            has_custom_paths: !!settings.custom_browser_paths,
            raw_paths: settings.custom_browser_paths,
            type: typeof settings.custom_browser_paths
        });
        // 1. Mine History (Extract)
        const history = await miner.mineHistory(settings, supabase);
        // 2. Analyze (Process in background with completion callback)
        if (history.length > 0) {
            // Deliberately not awaited: the response must not wait for analysis.
            alchemist.process(history, settings, supabase, settings.user_id, syncStartTime).catch(async (err) => {
                console.error("Alchemist Error:", err);
                // Emit error completion event. The logging call is awaited
                // inside its own try/catch so a failure to record the event
                // cannot surface as an unhandled promise rejection.
                try {
                    await processingEvents.log({
                        eventType: 'error',
                        agentState: 'Failed',
                        message: `Sync failed: ${err.message}`,
                        level: 'error',
                        metadata: {
                            is_completion: true,
                            error: err.message
                        },
                        userId: settings.user_id
                    }, supabase);
                }
                catch (logError) {
                    console.error('[API] Failed to record sync failure event:', logError);
                }
            });
        }
        else {
            // No URLs to process - emit completion immediately
            const duration = Date.now() - syncStartTime;
            await processingEvents.log({
                eventType: 'info',
                agentState: 'Completed',
                message: 'Sync completed - no new URLs found',
                level: 'info',
                durationMs: duration,
                metadata: {
                    is_completion: true,
                    total_urls: 0,
                    signals_found: 0,
                    skipped: 0,
                    errors: 0,
                    duration_seconds: Math.round(duration / 1000)
                },
                userId: settings.user_id
            }, supabase);
        }
        res.json({ success: true, history_count: history.length, queued: true });
    }
    catch (error) {
        console.error('Mining Logic Error:', error);
        res.status(500).json({ error: error.message });
    }
});
|
|
147
|
+
// Browser path detection: returns everything the detector finds, or a 500
// with the error message if detection throws.
app.get('/api/browser-paths/detect', async (req, res) => {
    try {
        res.json(new BrowserPathDetector().detectAll());
    }
    catch (err) {
        res.status(500).json({ error: err.message });
    }
});
|
|
158
|
+
// Browser path validation
//
// Expects a JSON body `{ path: string }` and returns the detector's
// validation result for that path.
// NOTE(review): this checks an arbitrary caller-supplied filesystem path;
// confirm the endpoint is only reachable by the local UI.
app.post('/api/browser-paths/validate', async (req, res) => {
    // `req.body` may be undefined when no JSON body was parsed; default to an
    // empty object so a missing body gets the 400 below instead of a TypeError.
    const { path: filePath } = req.body ?? {};
    if (typeof filePath !== 'string' || !filePath) {
        return res.status(400).json({ error: 'Path is required' });
    }
    try {
        const detector = new BrowserPathDetector();
        const result = detector.validateSQLitePath(filePath);
        res.json(result);
    }
    catch (error) {
        res.status(500).json({ error: error.message });
    }
});
|
|
173
|
+
// Test/debug endpoint: analyze a piece of text with the configured LLM.
//
// Expects a JSON body `{ text: string }`. Starts from the default Ollama
// configuration and, when Supabase is configured, overrides it with the
// settings row of the user id returned by the `get_any_user_id` RPC.
// NOTE(review): this path uses the service-role client and is not tied to
// the caller's identity; confirm it is not exposed outside local debugging.
app.post('/api/test/analyze', async (req, res) => {
    // `req.body` may be undefined when no JSON body was parsed.
    const { text } = req.body ?? {};
    if (typeof text !== 'string') {
        return res.status(400).json({ error: 'Text is required' });
    }
    try {
        let config = {
            baseUrl: CONFIG.OLLAMA_HOST,
            model: 'llama3',
            apiKey: ''
        };
        if (SupabaseService.isConfigured()) {
            const supabase = SupabaseService.getServiceRoleClient();
            const { data: userData } = await supabase.rpc('get_any_user_id');
            if (userData) {
                const { data: settings } = await supabase
                    .from('alchemy_settings')
                    .select('*')
                    .eq('user_id', userData)
                    .single();
                if (settings) {
                    // Stored settings win; each field falls back in order.
                    config = {
                        baseUrl: settings.llm_base_url || settings.ollama_host || CONFIG.OLLAMA_HOST,
                        model: settings.llm_model_name || 'llama3',
                        apiKey: settings.llm_api_key || settings.openai_api_key || settings.anthropic_api_key || ''
                    };
                }
            }
        }
        const result = await alchemist.analyzeSignal(text, config);
        res.json(result);
    }
    catch (error) {
        res.status(500).json({ error: error.message });
    }
});
|
|
206
|
+
// Test LLM Connection
//
// Expects a JSON body `{ baseUrl, modelName, apiKey }` and returns whatever
// `alchemist.testConnection` reports. Any failure is answered with
// `{ success: false, message }` and status 500.
app.post('/api/llm/test', async (req, res) => {
    try {
        // Read the body inside the try, with an empty-object default, so a
        // missing body yields the JSON error shape instead of a TypeError
        // thrown outside the handler's error path.
        const { baseUrl, modelName, apiKey } = req.body ?? {};
        const result = await alchemist.testConnection({
            baseUrl,
            model: modelName,
            apiKey
        });
        res.json(result);
    }
    catch (error) {
        res.status(500).json({ success: false, message: error.message });
    }
});
|
|
221
|
+
// Unified Static Assets Serving
//
// Serves the built UI from ELECTRON_STATIC_PATH when set, otherwise from the
// `dist` directory two levels above this module. Skipped when the directory
// does not exist.
const staticPath = process.env.ELECTRON_STATIC_PATH || path.join(__dirname, '..', '..', 'dist');
if (fs.existsSync(staticPath)) {
    console.log(`[Alchemy] Serving UI from ${staticPath}`);
    app.use(express.static(staticPath));
    const indexHtml = path.join(staticPath, 'index.html');
    const isBackendPath = (requestPath) => requestPath.startsWith('/api') || requestPath.startsWith('/events');
    // Client-side routing fallback (Bypass path-to-regexp error in Express 5):
    // anything that is not a backend path gets index.html.
    app.use((req, res, next) => {
        if (isBackendPath(req.path)) {
            next();
            return;
        }
        res.sendFile(indexHtml);
    });
}
|
|
234
|
+
// Start the HTTP server on the configured port and announce it once bound.
const { PORT } = CONFIG;
app.listen(PORT, () => {
    const platform = process.platform;
    console.log(`[Alchemy API] Running on port ${PORT} (${platform})`);
});
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
// Lines that mark the START of a reply chain or a generic footer; everything
// from such a line onward is dropped.
const TRUNCATION_PATTERNS = [
    /^On .* wrote:$/i,
    /^From: .* <.*>$/i,
    /^-----Original Message-----$/i,
    /^________________________________$/i,
    /^Sent from my iPhone$/i,
    /^Sent from my Android$/i,
    /^Get Outlook for/i,
    /^--$/ // Standard signature separator
];

// Boilerplate lines that are removed individually without truncating the rest.
const NOISE_PATTERNS = [
    /view in browser/i,
    /click here to view/i,
    /legal notice/i,
    /all rights reserved/i,
    /privacy policy/i,
    /terms of service/i,
    /unsubscribe/i
];

// Basic entity decoding. `&amp;` is decoded LAST so that an escaped entity
// such as `&amp;lt;` becomes the literal text `&lt;` rather than `<`.
const HTML_ENTITY_REPLACEMENTS = [
    [/&nbsp;/gi, ' '],
    [/&lt;/gi, '<'],
    [/&gt;/gi, '>'],
    [/&quot;/gi, '"'],
    [/&#0*39;|&apos;/gi, "'"],
    [/&amp;/gi, '&'],
];

/**
 * Lightweight HTML -> Markdown conversion.
 *
 * @param {string} html - Text already detected as containing HTML tags.
 * @returns {string} Text with tags converted or stripped and basic entities decoded.
 */
function htmlToMarkdown(html) {
    let text = html;
    // Style/Script removal first, so their contents are never mistaken for
    // document structure by the conversions below.
    text = text.replace(/<script\b[^>]*>.*?<\/script>/gsi, '');
    text = text.replace(/<style\b[^>]*>.*?<\/style>/gsi, '');
    // Structure: <br>, <p> -> Newlines
    text = text.replace(/<br\s*\/?>/gi, '\n');
    text = text.replace(/<\/p>/gi, '\n\n');
    // `\b` keeps <p> from also matching <pre>, <param>, ...
    text = text.replace(/<p\b[^>]*>/gi, '');
    // Structure: Headers <h1>-<h6> -> # Title
    text = text.replace(/<h[1-6]\b[^>]*>(.*?)<\/h[1-6]>/gsi, (match, inner) => `\n# ${inner}\n`);
    // Structure: Lists <li> -> - Item (`\b` keeps <li> from matching <link>)
    text = text.replace(/<li\b[^>]*>(.*?)<\/li>/gsi, (match, inner) => `\n- ${inner}`);
    text = text.replace(/<ul\b[^>]*>/gi, '');
    text = text.replace(/<\/ul>/gi, '\n');
    // Links: <a href="...">text</a> -> [text](href)
    text = text.replace(/<a\s+(?:[^>]*?\s+)?href="([^"]*)"[^>]*>(.*?)<\/a>/gsi, (match, href, content) => `[${content}](${href})`);
    // Images: <img src="..." alt="..."> -> ![alt](src)
    text = text.replace(/<img\s+(?:[^>]*?\s+)?src="([^"]*)"(?:[^>]*?\s+)?alt="([^"]*)"[^>]*>/gsi, (match, src, alt) => `![${alt}](${src})`);
    // Final Strip of remaining tags
    text = text.replace(/<[^>]+>/g, ' ');
    // Entity decoding (Basic)
    for (const [pattern, replacement] of HTML_ENTITY_REPLACEMENTS) {
        text = text.replace(pattern, replacement);
    }
    return text;
}

/**
 * Break up character sequences that chat-style LLM prompts treat as control
 * tokens, so content cannot be read as prompt structure.
 *
 * @param {string} text
 * @returns {string}
 */
function sanitizeSpecialTokens(text) {
    return text
        .replace(/<\|/g, '< |')
        .replace(/\|>/g, '| >')
        .replace(/\[INST\]/gi, '[ INST ]')
        .replace(/\[\/INST\]/gi, '[ /INST ]');
}

export class ContentCleaner {
    /**
     * Cleans content by removing noise, boilerplate, and ensuring safe tokens.
     * Optimized for LLM processing.
     *
     * Steps: convert HTML to lightweight Markdown when tags are detected, drop
     * quoted (`>`) lines, stop at the first reply header / signature marker,
     * drop short boilerplate lines, collapse whitespace, then neutralise LLM
     * special tokens.
     *
     * @param {string} text - Raw content; falsy input yields "".
     * @returns {string} Cleaned, trimmed text. If cleaning leaves fewer than
     *   20 characters while the input had more, the first 3000 characters of
     *   the original are returned instead (still token-sanitized).
     */
    static clean(text) {
        if (!text)
            return "";
        const originalText = text;
        // 1. Detect if content is actually HTML
        if (/<[a-z][\s\S]*>/i.test(text)) {
            text = htmlToMarkdown(text);
        }
        const cleanedLines = [];
        for (const line of text.split('\n')) {
            const lineStripped = line.trim();
            if (!lineStripped) {
                cleanedLines.push("");
                continue;
            }
            // 2. Quoted text removal (lines starting with >)
            if (lineStripped.startsWith('>')) {
                continue;
            }
            // 3. Truncation check: If we hit a reply header, we stop entirely
            if (TRUNCATION_PATTERNS.some((pattern) => pattern.test(lineStripped))) {
                break;
            }
            // 4. Noise check: strip boilerplate, but only on short lines so a
            //    real sentence that mentions e.g. "privacy policy" is kept.
            if (lineStripped.length < 100 && NOISE_PATTERNS.some((pattern) => pattern.test(lineStripped))) {
                continue;
            }
            cleanedLines.push(line);
        }
        // Reassemble and collapse whitespace
        text = cleanedLines
            .join('\n')
            .replace(/\n{3,}/g, '\n\n')
            .replace(/[ \t]{2,}/g, ' ');
        // Safety Fallback: If cleaning stripped too much, return original text
        // truncated. It is still sanitized so the fallback cannot leak
        // special tokens.
        if (text.trim().length < 20 && originalText.trim().length > 20) {
            return sanitizeSpecialTokens(originalText.substring(0, 3000)).trim();
        }
        // Sanitize LLM Special Tokens
        return sanitizeSpecialTokens(text).trim();
    }
}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|